import os
import numpy as np
import scipy
import scipy.io
from gensim import corpora, utils
import pandas as pd
from numpy.linalg import norm
import itertools
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.wrappers import DtmModel
import os
from gensim import corpora, utils
from gensim.corpora import Dictionary, bleicorpus
import numpy as np
from gensim.matutils import hellinger, Sparse2Corpus,Scipy2Corpus
import scipy
import scipy.io
import pandas as pd


#This script first averages the topic model output of each newspaper over time,
#writing an averaged_topic_dist.csv file for each newspaper.
#Each file holds a 20000 x 8 matrix (one row per vocabulary word, one column
#per topic): the over-time average of the 8 topics of that newspaper.

#Next, it builds a shared vocabulary list from the vocabulary files
#of all newspapers (the words that occur in every one of them).
#Restricted to that shared vocabulary, the averaged topics of each
#newspaper are matched to their closest neighbors among the topics
#of the other newspapers.

# Root directory that contains one sub-folder per newspaper.
newspaper_root = "newspaper_folders_path_with_vocabulary_files"

# os.listdir returns bare entry names, so every name has to be joined with the
# root before it is tested with os.path.isdir or used to build file paths;
# otherwise both are resolved against the current working directory.
# The names are sorted so that the first folder (used further down as the
# reference newspaper) does not depend on the arbitrary order of os.listdir.
folderlist = [
    os.path.join(newspaper_root, name)
    for name in sorted(os.listdir(newspaper_root))
    if os.path.isdir(os.path.join(newspaper_root, name))
]
path_to_dtm_binary = 'dynamic_topic_model_binaries_path'


#This first piece of code creates over-time averages of the 8 topics for each newspaper
#and creates a shared vocabulary list of the words shared by all 50 newspapers.
def _read_vocabulary(path):
    """Return the words listed in *path* (one per line), in file order."""
    with open(path) as f:
        return [line.strip('\n') for line in f]


num_topics = 8
entries = None  # becomes the set of words present in every vocabulary file
for folder in folderlist:
    path_to_vocab = folder + '/vocablist.txt'
    path_to_model = folder + '/dim_model_2010_8tops.model'
    path_to_times = folder + '/time_slices.csv'
    path_to_save_averaged_topics = folder + '/averaged_topic_dist.csv'

    # The vocabulary is read once and reused for both the topic table and the
    # shared-vocabulary intersection. (The corpus file was previously loaded
    # as well, but nothing derived from it was ever used.)
    vec_words = _read_vocabulary(path_to_vocab)
    model_paper = DtmModel.load(path_to_model)
    timelen = len(pd.read_csv(path_to_times))

    # show_topics returns one list of (probability, word) pairs per
    # (time slice, topic) combination, time slice by time slice, so topic i at
    # time slice j sits at index i + j*num_topics.
    mat_topics = model_paper.show_topics(
        num_topics=num_topics,
        times=timelen,
        num_words=len(vec_words),
        formatted=False,
    )
    vec_topics = np.zeros([num_topics, len(vec_words)], dtype=float)
    for i in range(num_topics):
        for j in range(timelen):
            for prob, word in mat_topics[i + (j * num_topics)]:
                # NOTE(review): this assumes the model reports each word as
                # its 0-based integer id (as a string) - confirm against how
                # the model was trained.
                vec_topics[i, int(word)] += prob
    vec_topics = vec_topics / timelen

    # One row per word; column 0 is the word, columns 1..8 the averaged
    # probability of that word in each topic. Building the frame directly
    # keeps the probabilities numeric instead of routing them through a
    # string array.
    averaged = pd.DataFrame(vec_topics.transpose(), columns=range(1, num_topics + 1))
    averaged.insert(0, 0, vec_words)
    averaged.to_csv(path_to_save_averaged_topics, index=False)

    # Keep only the words seen in every newspaper processed so far.
    words = set(vec_words)
    entries = words if entries is None else entries.intersection(words)

if entries is None:
    # No newspaper folder was found: the shared vocabulary is empty.
    entries = set()

#Write the new topic distributions over the
#shared vocabulary for each newspaper.
values = {'0': list(entries)}
for folder in folderlist:
    path_to_topics = folder + '/averaged_topic_dist.csv'
    path_to_save = folder + '/topics_combined_vocab.csv'
    if not os.path.exists(path_to_topics):
        continue
    # Column '0' holds the words. It is read as plain strings with the default
    # NA detection switched off, so that tokens such as "null" or "nan", or
    # numeric-looking tokens, still compare equal to the shared-vocabulary
    # strings.
    topics = pd.read_csv(path_to_topics, dtype={'0': str}, keep_default_na=False)
    # Keep the rows whose word belongs to the shared vocabulary. Testing the
    # word column directly avoids DataFrame.any(1), whose positional axis
    # argument is rejected by pandas >= 2.0.
    adjusted_topics = topics[topics['0'].isin(values['0'])]
    # Sorting by word puts the rows of every newspaper in the same order, which
    # the topic comparison further down relies on.
    adjusted_topics = adjusted_topics.sort_values(by=['0'])
    adjusted_topics.to_csv(path_to_save, index=False)

#Find 8 'clusters' of topics by greedily matching all
#topics with the closest neighbors
def _cosine_distance(u, v):
    """Return 1 - cosine similarity of *u* and *v* (1.0 if either is all zeros)."""
    u = np.asarray(u, dtype=float).ravel()
    v = np.asarray(v, dtype=float).ravel()
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0:
        return 1.0
    return 1.0 - float(np.dot(u, v)) / denom


# topics[k] collects the column vectors of all newspaper topics assigned to
# cluster k (0-based); cluster k carries the label k+1 in the output files.
topics = {k: [] for k in range(8)}
for folder in folderlist:
    path_to_topics = folder + '/topics_combined_vocab.csv'
    path_to_save = folder + '/topics_labels.csv'
    path_to_save_adjusted = folder + '/topics_combined_vocab_ordered.csv'
    if not os.path.exists(path_to_topics):
        continue
    paper_topics = pd.read_csv(path_to_topics)

    if not topics[0]:
        # The first newspaper that has a topics file defines the clusters:
        # its topic i becomes cluster i. Its labels are therefore 1..8 and its
        # topic columns are already in cluster order.
        for i in range(8):
            topics[i].append(np.array(paper_topics[[str(i + 1)]]))
        topic_labels = list(range(1, 9))
        pd.DataFrame(topic_labels).to_csv(path_to_save, index=False)
        paper_topics.to_csv(path_to_save_adjusted, index=False)
        continue

    # topic_labels[t] is the cluster label (1..8) given to this newspaper's
    # topic t+1, so the labels file can be read without knowing the random
    # order in which the topics were processed.
    topic_labels = [0] * 8
    unmatched = list(range(0, 8))
    order_topics = list(np.random.permutation(8))
    while order_topics:
        curr_top = int(order_topics.pop())
        curr_top_vector = np.array(paper_topics[[str(curr_top + 1)]])
        min_dist = float('inf')
        min_top = -1
        for j in unmatched:
            # Total cosine distance to the members of cluster j. A distance
            # (not a similarity) is accumulated so that the smallest value
            # really identifies the closest cluster.
            curr_dist = sum(
                (_cosine_distance(vec, curr_top_vector) for vec in topics[j]),
                0.0,
            )
            print('curr_dist:'+str(curr_dist))
            if curr_dist < min_dist:
                min_dist = curr_dist
                min_top = j
        print('min_dist:'+str(min_dist))
        print('min_top:'+str(min_top))
        # Each cluster receives exactly one topic per newspaper.
        unmatched.remove(min_top)
        topics[min_top].append(curr_top_vector)
        topic_labels[curr_top] = min_top + 1
    pd.DataFrame(topic_labels).to_csv(path_to_save, index=False)

    # Reorder the topic columns so that column k holds the topic matched to
    # cluster k, and name the columns after the cluster labels so the file
    # has the same header for every newspaper.
    column_of_cluster = {label: str(t + 1) for t, label in enumerate(topic_labels)}
    ordered_topics = paper_topics[['0'] + [column_of_cluster[c] for c in range(1, 9)]].copy()
    ordered_topics.columns = ['0'] + [str(c) for c in range(1, 9)]
    ordered_topics.to_csv(path_to_save_adjusted, index=False)
            


